Display system information and load tidyverse and faraway packages
# Record the R version, platform, and loaded-package state for reproducibility.
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.2 magrittr_1.5 tools_3.6.2 htmltools_0.4.0
## [5] yaml_2.2.1 Rcpp_1.0.4.6 stringi_1.4.6 rmarkdown_2.1
## [9] knitr_1.28 stringr_1.4.0 xfun_0.13 digest_0.6.25
## [13] rlang_0.4.5 evaluate_0.14
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.0 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(faraway)
Given a regressor/predictor \(x_1,\ldots,x_n\) and response \(y_1, \ldots, y_n\), where \[ y_i = f(x_i) + \epsilon_i, \] where \(\epsilon_i\) are iid with mean zero and unknown variance \(\sigma^2\).
Parametric approach assumes that \(f(x)\) belongs to a parametric family \(f(x \mid \boldsymbol{\beta})\). Examples are \[\begin{eqnarray*} f(x \mid \boldsymbol{\beta}) &=& \beta_0 + \beta_1 x \\ f(x \mid \boldsymbol{\beta}) &=& \beta_0 + \beta_1 x + \beta_2 x^2 \\ f(x \mid \boldsymbol{\beta}) &=& \beta_0 + \beta_1 x^{\beta_2}. \end{eqnarray*}\]
Nonparametric approach assumes \(f\) is from some smooth family of functions.
Three datasets.
exa: \(f(x) = \sin^3 (2 \pi x^3)\).

exa <- as_tibble(exa) %>% print()
## # A tibble: 256 x 3
## x y m
## <dbl> <dbl> <dbl>
## 1 0.00480 -0.0339 0
## 2 0.0086 0.165 0
## 3 0.0117 0.0245 0
## 4 0.017 0.178 0
## 5 0.0261 -0.347 0
## 6 0.0299 -0.755 0
## 7 0.0307 0.355 0
## 8 0.0315 0.0401 0
## 9 0.0331 0.106 0
## 10 0.034 0.122 0
## # … with 246 more rows
# Scatterplot of the exa data with a loess smooth (small span -> wigglier
# fit) and the true regression function m drawn as a line.
ggplot(exa, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(span = 0.2) +  # smaller span gives more wiggliness
  geom_line(aes(y = m))      # true curve
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
exb: \(f(x) = 0\).

exb <- as_tibble(exb) %>% print()
## # A tibble: 50 x 3
## x y m
## <dbl> <dbl> <dbl>
## 1 0.02 0.781 0
## 2 0.04 1.64 0
## 3 0.06 -0.363 0
## 4 0.08 -0.0540 0
## 5 0.1 -0.804 0
## 6 0.12 0.907 0
## 7 0.14 -0.843 0
## 8 0.16 -0.229 0
## 9 0.18 -0.499 0
## 10 0.2 1.20 0
## # … with 40 more rows
# Scatterplot of the exb data with the default loess smooth and the true
# regression function (identically zero) drawn as a line.
ggplot(exb, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth() +
  geom_line(aes(y = m))  # true curve: f(x) = 0
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
faithful: data on the Old Faithful geyser in Yellowstone National Park.

faithful <- as_tibble(faithful) %>% print()
## # A tibble: 272 x 2
## eruptions waiting
## <dbl> <dbl>
## 1 3.6 79
## 2 1.8 54
## 3 3.33 74
## 4 2.28 62
## 5 4.53 85
## 6 2.88 55
## 7 4.7 88
## 8 3.6 85
## 9 1.95 51
## 10 4.35 85
## # … with 262 more rows
# Old Faithful: waiting time versus eruption duration, with the default
# loess smooth overlaid.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point() +
  geom_smooth()  # default loess; a smaller span would give more wiggliness
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Moving average estimator \[ \hat f_\lambda(x) = \frac{1}{n\lambda} \sum_{j=1}^n K\left( \frac{x-x_j}{\lambda} \right) Y_j = \frac{1}{n} \sum_{j=1}^n w_j Y_j, \] where \[ w_j = \lambda^{-1} K\left( \frac{x-x_j}{\lambda} \right), \] and \(K\) is a kernel such that \(\int K(x) \, dx = 1\). \(\lambda\) is called the bandwidth, window width or smoothing parameter.
Asymptotics of kernel estimators \[ \text{MSE}(x) = \mathbb{E} [f(x) - \hat f_\lambda(x)]^2 = O(n^{-4/5}). \] Typical parametric estimator has \(\text{MSE}(x) = O(n^{-1})\) if the parametric model is correct.
Choice of kernel. Ideal kernel is smooth, compact, and amenable to rapid computation. The optimal choice under some standard assumptions is the Epanechnikov kernel \[ K(x) = \begin{cases} \frac 34 (1 - x^2) & |x| < 1 \\ 0 & \text{otherwise} \end{cases}. \]
# Gaussian kernel smoother at three bandwidths: 0.1 (rough), 0.5
# (moderate), and 2 (very smooth). One plot per bandwidth.
for (bandwidth in c(0.1, 0.5, 2)) {
  plot(waiting ~ eruptions, data = faithful, col = gray(0.75))
  with(faithful, lines(ksmooth(eruptions, waiting, "normal", bandwidth)))
}
library(sm)
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
# Kernel regression via the sm package; the bandwidth h is chosen
# automatically by h.select() rather than fixed by hand.
with(faithful,
sm.regression(eruptions, waiting, h = h.select(eruptions, waiting)))
# Same automatic-bandwidth smoother on the two simulated examples.
with(exa, sm.regression(x, y, h = h.select(x, y)))
with(exb, sm.regression(x, y, h = h.select(x, y)))
We choose \(\hat f\) to minimize the modified least squares criterion \[ \frac 1n \sum_i [y_i - f(x_i)]^2 + \lambda \int [f''(x)]^2 \, dx, \] where \(\lambda > 0\) is the smoothing parameter and \(\int [f''(x)]^2 \, dx\) is a roughness penalty. For large \(\lambda\), the minimizer \(\hat f\) is smoother; for smaller \(\lambda\), the minimizer \(\hat f\) is rougher. This is the smoothing spline fit.
The minimizer takes a special form: \(\hat f\) is a cubic spline (piecewise cubic polynomial in each interval \((x_i, x_{i+1})\)).
The tuning parameter \(\lambda\) is chosen by cross-validation in R.
# Smoothing spline fit to the Old Faithful data; smooth.spline() selects
# its smoothing parameter by cross-validation.
plot(waiting ~ eruptions, data = faithful, col = gray(0.75))
lines(smooth.spline(faithful$eruptions, faithful$waiting), lty = 2)
# Smoothing spline fit to exa, with the true regression function overlaid.
plot(y ~ x, data = exa, col = gray(0.75))
lines(exa$x, exa$m)                           # true curve
lines(smooth.spline(exa$x, exa$y), lty = 2)   # smoothing spline fit
# Smoothing spline fit to exb, with the true regression function overlaid.
plot(y ~ x, data = exb, col = gray(0.75))
lines(exb$x, exb$m)                           # true curve: f(x) = 0
lines(smooth.spline(exb$x, exb$y), lty = 2)   # smoothing spline fit
The regression spline fit differs from the smoothing splines in that the number of knots can be much smaller than the sample size.
Piecewise linear splines:
# Truncated-line ("hockey stick") basis function: returns x - c where x
# exceeds the knot c, and 0 otherwise. Vectorized in both arguments.
# pmax() replaces the original ifelse(x > c, x - c, 0): it is the
# idiomatic vectorized form and, unlike ifelse(), returns numeric(0)
# rather than logical(0) for empty input.
rhs <- function(x, c) pmax(x - c, 0)
# Visualize one basis function with its knot at 0.5.
curve(rhs(x, 0.5), 0, 1)
# Ten evenly spaced knots on [0, 0.9]; the outer parentheses print the
# assigned value.
(knots <- 0:9 / 10)
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9
and compute a design matrix of splines with knots at these points for each \(x\):
# Design matrix: one truncated-line basis column per knot, evaluated at
# every x in exa.
dm <- outer(exa$x, knots, rhs)
matplot(exa$x, dm, type = "l", col = 1, xlab = "x", ylab = "")
# Ordinary least squares on the piecewise-linear spline basis.
lmod <- lm(exa$y ~ dm)
plot(y ~ x, data = exa, col = gray(0.75))
lines(exa$x, fitted(lmod))
# Hand-picked knots, concentrated where the true curve varies fastest.
newknots <- c(0, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95)
dmn <- outer(exa$x, newknots, rhs)
lmod <- lm(exa$y ~ dmn)
plot(y ~ x, data = exa, col = gray(0.75))
lines(exa$x, fitted(lmod))
The bs() function can be used to generate the appropriate spline basis. The default is cubic B-splines.

library(splines)
# Cubic B-spline basis with 12 degrees of freedom, drawn on [0, 1].
matplot(bs(seq(0, 1, length.out = 1000), df = 12), type = "l", ylab = "", col = 1)
# Regression spline fit for exa using that basis.
lmod <- lm(y ~ bs(x, df = 12), data = exa)
plot(y ~ x, data = exa, col = gray(0.75))
lines(m ~ x, data = exa)                       # true curve
lines(predict(lmod) ~ x, data = exa, lty = 2)  # B-spline fit
Both kernel and spline smoothers are vulnerable to outliers.
The local polynomial method fits a polynomial in a window using robust methods; the predicted response at the middle of the window is the fitted value. This procedure is repeated by sliding the window over the range of the data. Use the lowess() or loess() functions in R.
We need to choose the order of the polynomial and the window width. In R, lowess() defaults to a window covering 2/3 of the data with locally linear fits, while loess() defaults to a span of 3/4 with locally quadratic (degree 2) fits.
Examples.
# Default loess fit to Old Faithful; the fitted values are reordered by
# eruption time so lines() draws a single left-to-right curve.
with(faithful, {
  plot(waiting ~ eruptions, col = gray(0.75))
  fit <- loess(waiting ~ eruptions)
  ord <- order(eruptions)
  lines(fit$x[ord], fit$fitted[ord])
})
# loess on exa: default span versus a much smaller span.
with(exa, {
  plot(y ~ x, col = gray(0.75))
  lines(m ~ x)  # true curve
  fit_default <- loess(y ~ x)
  lines(fit_default$x, fit_default$fitted, lty = 2)
  # A smaller span (proportion of the data in each window) tracks the
  # wiggly truth more closely.
  fit_small <- loess(y ~ x, span = 0.22)
  lines(fit_small$x, fit_small$fitted, lty = 5)
})
# loess on exb: default span versus span = 1 (the whole range of the
# data, giving the smoothest fit).
with(exb, {
  plot(y ~ x, col = gray(0.75))
  lines(m ~ x)  # true curve: f(x) = 0
  fit_default <- loess(y ~ x)
  lines(fit_default$x, fit_default$fitted, lty = 2)
  fit_full <- loess(y ~ x, span = 1)  # span = 1: smoothest fit
  lines(fit_full$x, fit_full$fitted, lty = 5)
})
# Same small-span loess fit drawn with ggplot2, true curve dashed.
ggplot(exa, aes(x = x, y = y)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess", span = 0.22) +
  geom_line(aes(y = m), linetype = 2)
## `geom_smooth()` using formula 'y ~ x'
mgcv package.

library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
# Penalized-spline fit via mgcv::gam, true curve dashed. The original
# call also passed span = 0.22, but span applies only to the loess
# smoother and is ignored when method = "gam"; the basis dimension k in
# s(x, k = 20) is what controls the flexibility here, so the dead
# argument is dropped.
ggplot(data = exa, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "gam", formula = y ~ s(x, k = 20)) +
  geom_line(mapping = aes(x = x, y = m), linetype = 2)
In general, we approximate a curve by a family of basis functions \[ \hat f(x) = \sum_i c_i \phi_i(x), \] where the basis functions \(\phi_i\) are given and the coefficients \(c_i\) are estimated.
Ideally we would like the basis functions \(\phi_i\) to be (1) compactly supported (adapted to local data points) and (2) orthogonal (fast computation).
Orthogonal polynomials and Fourier basis are not compactly supported.
Cubic B-splines are compactly supported but not orthogonal.
Wavelets are both compactly supported and orthogonal.
Haar basis. The mother wavelet function on [0, 1] \[ w(x) = \begin{cases} 1 & x \le 1/2 \\ -1 & x > 1/2 \end{cases}. \] The next two members are defined on [0, 1/2) and [1/2, 1) by rescaling the mother wavelet to these two intervals. In general, at level \(j\), \[ h_n(x) = 2^{j/2} w(2^j x - k), \] where \(n = 2^j + k\) and \(0 \le k < 2^j\).
library(wavethresh)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:sm':
##
## muscle
## The following object is masked from 'package:dplyr':
##
## select
## WaveThresh: R wavelet software, release 4.6.8, installed
## Copyright Guy Nason and others 1993-2016
## Note: nlevels has been renamed to nlevelsWT
# Discrete wavelet transform of the exa response using the Haar wavelet
# (filter.number = 1 in the DaubExPhase family). Note wd() requires the
# series length to be a power of 2; exa has 256 observations.
wds <- wd(exa$y, filter.number = 1, family = "DaubExPhase")
# Draw the wavelet itself, then the coefficients by resolution level.
draw(wds)
plot(wds)
## [1] 6.12868 6.12868 6.12868 6.12868 6.12868 6.12868 6.12868 6.12868
Let’s only retain 3 levels of coefficients.
# Manual thresholding at a huge value zeroes every coefficient in the
# thresholded levels, leaving only the coarsest levels of the transform.
wtd <- threshold(wds, policy = "manual", value = 9999)
fd <- wr(wtd)  # inverse wavelet transform of the thresholded object
plot(y ~ x, data = exa, col = gray(0.75))
lines(m ~ x, data = exa)                      # true curve
lines(fd ~ x, data = exa, lty = 5, lwd = 2)   # wavelet reconstruction
Or we can zero out only the small coefficients.
# Default thresholding: only the small coefficients are zeroed out, so
# more detail survives than with the manual threshold above.
wtd2 <- threshold(wds)
fd2 <- wr(wtd2)  # inverse wavelet transform
plot(y ~ x, data = exa, col = gray(0.75))
lines(m ~ x, data = exa)                       # true curve
lines(fd2 ~ x, data = exa, lty = 5, lwd = 2)   # wavelet reconstruction
# Retransform with a smoother wavelet (filter.number = 2) and
# "wavelets on the interval" boundary handling.
wds <- wd(exa$y, filter.number = 2, bc = "interval")
# Draw this wavelet, then the coefficients by resolution level.
draw(filter.number = 2, family = "DaubExPhase")
plot(wds)
## [1] 3.3061 3.3061 3.3061 3.3061 3.3061 3.3061 3.3061 3.3061
Now we zero out small coefficients.
# Default thresholding of the interval-boundary transform, then
# reconstruct and compare against the true curve.
wtd <- threshold(wds)
fd <- wr(wtd)  # inverse transform; optional wr() args are ignored here
## Warning in wr.wd(wtd): All optional arguments ignored for "wavelets on the
## interval" transform
plot(y ~ x, data = exa, col = gray(0.75))
lines(m ~ x, data = exa)                # true curve
lines(fd ~ x, data = exa, lty = 2)      # wavelet reconstruction
# Old Faithful via wavelets: the design is irregular, so rescale the
# eruption times to [0, 1], interpolate onto a regular grid with
# makegrid(), then transform, threshold, and invert.
x <- with(faithful, (eruptions - min(eruptions)) / diff(range(eruptions)))
gridof <- makegrid(x, faithful$waiting)
wdof <- irregwd(gridof, bc = "symmetric")
wtof <- threshold(wdof)
wrof <- wr(wtof)
plot(waiting ~ eruptions, data = faithful, col = grey(0.75))
# The reconstruction has 512 values; map them back onto the original
# eruption-time scale.
with(faithful,
     lines(seq(min(eruptions), max(eruptions), length.out = 512), wrof))